In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
train = pd.read_csv("house_train.csv")
test  = pd.read_csv("house_test.csv")
In [3]:
train.head()
Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [4]:
train.shape
Out[4]:
(1460, 81)
In [5]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [6]:
len(train[train.duplicated()])
# There is no duplicated rows 
Out[6]:
0
In [7]:
na = train.columns[train.isnull().any()]
In [8]:
#columns which are having the null values
na
Out[8]:
Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')
In [9]:
null = train[na].isnull().sum()
In [10]:
# The sum of null values of each feature
null
Out[10]:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
In [11]:
plt.figure(figsize=(25,12))
sns.heatmap(train.isnull())  
Out[11]:
<AxesSubplot:>
In [12]:
train1 = train.drop(["Id","Alley","FireplaceQu","PoolQC","Fence","MiscFeature"],axis=1)
# Drop those features are not important to predict the output because it consists lot's of null values
In [13]:
train1.shape
Out[13]:
(1460, 75)
In [14]:
sns.pairplot(train1)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x20ae23b5a00>
In [14]:
train1["MasVnrType"].value_counts()
Out[14]:
None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
In [15]:
len(train1[train1["MasVnrType"]=="None"])
Out[15]:
864
In [16]:
train1["MasVnrType"].fillna("None",inplace=True)
In [17]:
train1["MasVnrType"].head(50)
Out[17]:
0     BrkFace
1        None
2     BrkFace
3        None
4     BrkFace
5        None
6       Stone
7       Stone
8        None
9        None
10       None
11      Stone
12       None
13      Stone
14    BrkFace
15       None
16    BrkFace
17       None
18       None
19       None
20    BrkFace
21       None
22    BrkFace
23       None
24       None
25      Stone
26       None
27      Stone
28       None
29       None
30       None
31       None
32       None
33       None
34    BrkFace
35      Stone
36       None
37    BrkFace
38       None
39       None
40    BrkFace
41       None
42       None
43       None
44       None
45    BrkFace
46       None
47       None
48       None
49       None
Name: MasVnrType, dtype: object
In [18]:
train1[train1["MasVnrArea"].isna()]
Out[18]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
234 60 RL NaN 7851 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 5 2010 WD Normal 216500
529 20 RL NaN 32668 Pave IR1 Lvl AllPub CulDSac Gtl ... 200 0 0 0 0 3 2007 WD Alloca 200624
650 60 FV 65.0 8125 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 5 2008 WD Normal 205950
936 20 RL 67.0 10083 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 8 2009 WD Normal 184900
973 20 FV 95.0 11639 Pave Reg Lvl AllPub Corner Gtl ... 0 0 0 0 0 12 2008 New Partial 182000
977 120 FV 35.0 4274 Pave IR1 Lvl AllPub Inside Gtl ... 0 0 0 0 0 11 2007 New Partial 199900
1243 20 RL 107.0 13891 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 9 2006 New Partial 465000
1278 60 RL 75.0 9473 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 3 2008 WD Normal 237000

8 rows × 75 columns

In [19]:
len(train1[train1["MasVnrArea"]==0])
Out[19]:
861
In [20]:
mean_area = train1["MasVnrArea"].mean()
mean_area
Out[20]:
103.68526170798899
In [21]:
median_area = train1["MasVnrArea"].median()
median_area
Out[21]:
0.0
In [22]:
train1["MasVnrArea"].fillna(median_area,inplace=True)
# filled the null values with mode because more than half the constructer follwed it
In [23]:
len(train1[train1["MasVnrArea"].isna()])
Out[23]:
0
In [24]:
train1["BsmtQual"].value_counts()
Out[24]:
TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64
In [25]:
train1["BsmtQual"].fillna("TA",inplace=True)
In [26]:
train2 = train1.copy()
In [27]:
train2
Out[27]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2008 WD Normal 208500
1 20 RL 80.0 9600 Pave Reg Lvl AllPub FR2 Gtl ... 0 0 0 0 0 5 2007 WD Normal 181500
2 60 RL 68.0 11250 Pave IR1 Lvl AllPub Inside Gtl ... 0 0 0 0 0 9 2008 WD Normal 223500
3 70 RL 60.0 9550 Pave IR1 Lvl AllPub Corner Gtl ... 272 0 0 0 0 2 2006 WD Abnorml 140000
4 60 RL 84.0 14260 Pave IR1 Lvl AllPub FR2 Gtl ... 0 0 0 0 0 12 2008 WD Normal 250000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 60 RL 62.0 7917 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 8 2007 WD Normal 175000
1456 20 RL 85.0 13175 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2010 WD Normal 210000
1457 70 RL 66.0 9042 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 2500 5 2010 WD Normal 266500
1458 20 RL 68.0 9717 Pave Reg Lvl AllPub Inside Gtl ... 112 0 0 0 0 4 2010 WD Normal 142125
1459 20 RL 75.0 9937 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 6 2008 WD Normal 147500

1460 rows × 75 columns

In [28]:
len(train2[train2["BsmtQual"].isna()])
Out[28]:
0
In [29]:
train2["BsmtQual"].value_counts()
Out[29]:
TA    686
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64
In [30]:
train2["BsmtCond"].value_counts()
Out[30]:
TA    1311
Gd      65
Fa      45
Po       2
Name: BsmtCond, dtype: int64
In [31]:
train2["BsmtCond"].fillna("TA",inplace=True)
In [32]:
train3 = train2.copy()
In [33]:
train3["BsmtExposure"].value_counts()
Out[33]:
No    953
Av    221
Gd    134
Mn    114
Name: BsmtExposure, dtype: int64
In [34]:
train3["BsmtExposure"].fillna("No",inplace=True)
In [35]:
train4 = train3.copy()
In [36]:
train4["Electrical"].value_counts()
Out[36]:
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
In [37]:
train4["Electrical"].fillna("SBrkr",inplace=True)
In [38]:
train5 = train4.copy()
In [39]:
train5["GarageType"].value_counts()
Out[39]:
Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64
In [40]:
train5["GarageType"].fillna("Attchd",inplace=True)
In [41]:
train6 =train5.copy()
In [42]:
train6["GarageYrBlt"].value_counts()
Out[42]:
2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1927.0     1
1900.0     1
1906.0     1
1908.0     1
1933.0     1
Name: GarageYrBlt, Length: 97, dtype: int64
In [43]:
train6["GarageYrBlt"].median()
Out[43]:
1980.0
In [44]:
train6["GarageYrBlt"].mean()
Out[44]:
1978.5061638868744
In [45]:
train6["GarageYrBlt"].mode()
Out[45]:
0    2005.0
Name: GarageYrBlt, dtype: float64
In [46]:
len(train6[train6["GarageYrBlt"]==1980])
Out[46]:
15
In [47]:
len(train6[train6["GarageYrBlt"]>=2000])
Out[47]:
402
In [48]:
train6["GarageYrBlt"].fillna(2000,inplace=True)
In [49]:
train7 = train6.copy()
In [50]:
train7["GarageFinish"].value_counts()
Out[50]:
Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64
In [51]:
train7["GarageFinish"].fillna("RFn",inplace=True)
In [52]:
train8 = train7.copy()
In [53]:
train8["GarageQual"].value_counts()
Out[53]:
TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: GarageQual, dtype: int64
In [54]:
train8["GarageQual"].fillna("TA",inplace=True)
In [55]:
train8["GarageCond"].value_counts()
Out[55]:
TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64
In [56]:
train9 = train8.copy()
In [57]:
train9["GarageCond"].fillna("TA",inplace=True)
In [58]:
train10 = train9.copy()
In [59]:
train10
Out[59]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2008 WD Normal 208500
1 20 RL 80.0 9600 Pave Reg Lvl AllPub FR2 Gtl ... 0 0 0 0 0 5 2007 WD Normal 181500
2 60 RL 68.0 11250 Pave IR1 Lvl AllPub Inside Gtl ... 0 0 0 0 0 9 2008 WD Normal 223500
3 70 RL 60.0 9550 Pave IR1 Lvl AllPub Corner Gtl ... 272 0 0 0 0 2 2006 WD Abnorml 140000
4 60 RL 84.0 14260 Pave IR1 Lvl AllPub FR2 Gtl ... 0 0 0 0 0 12 2008 WD Normal 250000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 60 RL 62.0 7917 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 8 2007 WD Normal 175000
1456 20 RL 85.0 13175 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2010 WD Normal 210000
1457 70 RL 66.0 9042 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 2500 5 2010 WD Normal 266500
1458 20 RL 68.0 9717 Pave Reg Lvl AllPub Inside Gtl ... 112 0 0 0 0 4 2010 WD Normal 142125
1459 20 RL 75.0 9937 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 6 2008 WD Normal 147500

1460 rows × 75 columns

In [60]:
train10.columns[train10.isnull().any()]
Out[60]:
Index(['LotFrontage', 'BsmtFinType1', 'BsmtFinType2'], dtype='object')
In [61]:
train10 .shape
Out[61]:
(1460, 75)
In [62]:
train10.dropna(axis=0,inplace=True)
In [63]:
train10 .shape
Out[63]:
(1169, 75)
In [64]:
train10.columns[train10.isnull().any()]
Out[64]:
Index([], dtype='object')
In [65]:
def remove_outliers(train10, columns):
    # Create a boxplot of the DataFrame
    train10.boxplot(column=columns)
    
    # Calculate the upper and lower bounds of the boxplot for each column
    for col in columns:
        q1 = train10[col].quantile(0.25)
        q3 = train10[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound = q3 + (1.5 * iqr)
        lower_bound = q1 - (1.5 * iqr)
        
        # Remove the outlier rows from the DataFrame
        train10 = train10[(train10[col] <= upper_bound) & (train10[col] >= lower_bound)]
    
    # Create a new boxplot of the DataFrame without the outliers
    train10.boxplot(column=columns)
    
    # Return the DataFrame without the outliers
    return train10
In [66]:
train11= remove_outliers(train10,['MSSubClass','LotFrontage','LotArea',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd','MasVnrArea',
       'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF','1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr','TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'])
       
In [67]:
train11.shape
Out[67]:
(461, 75)
In [68]:
train11["YearRemodAdd"].value_counts()
Out[68]:
2006    54
2007    40
1950    36
2005    34
2004    28
2008    19
2003    18
2000    18
2002    14
2009    12
1995    12
1999    11
1998    10
1992     8
1954     8
1997     7
1994     7
1996     6
1977     6
1972     6
1956     5
1957     5
1965     5
1962     5
2001     5
1990     5
1993     5
1959     4
1970     4
1976     4
1958     4
1969     4
1963     3
1978     3
1974     3
1967     3
1985     3
1971     3
1966     3
1960     3
1968     3
2010     2
1989     2
1975     2
1953     2
1955     2
1951     2
1982     2
1979     2
1980     2
1984     2
1952     1
1964     1
1973     1
1961     1
1991     1
Name: YearRemodAdd, dtype: int64
In [69]:
train11
Out[69]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2008 WD Normal 208500
2 60 RL 68.0 11250 Pave IR1 Lvl AllPub Inside Gtl ... 0 0 0 0 0 9 2008 WD Normal 223500
4 60 RL 84.0 14260 Pave IR1 Lvl AllPub FR2 Gtl ... 0 0 0 0 0 12 2008 WD Normal 250000
6 20 RL 75.0 10084 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 8 2007 WD Normal 307000
10 20 RL 70.0 11200 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 2 2008 WD Normal 129500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1444 20 RL 63.0 8500 Pave Reg Lvl AllPub FR2 Gtl ... 0 0 0 0 0 11 2007 WD Normal 179600
1448 50 RL 70.0 11767 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 5 2007 WD Normal 112000
1451 20 RL 78.0 9262 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 5 2009 New Partial 287090
1454 20 FV 62.0 7500 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 10 2009 WD Normal 185000
1455 60 RL 62.0 7917 Pave Reg Lvl AllPub Inside Gtl ... 0 0 0 0 0 8 2007 WD Normal 175000

461 rows × 75 columns

In [70]:
train12 = train11.drop(["MiscVal","PoolArea","ScreenPorch","3SsnPorch","EnclosedPorch","KitchenAbvGr","BsmtHalfBath","LowQualFinSF","BsmtFinSF2"],axis=1)
In [71]:
train12.shape
Out[71]:
(461, 66)
In [72]:
sns.pairplot(train12)
Out[72]:
<seaborn.axisgrid.PairGrid at 0x15ca6e82460>
In [73]:
train12["SaleCondition"].value_counts()
Out[73]:
Normal     356
Partial     71
Abnorml     27
Family       5
AdjLand      1
Alloca       1
Name: SaleCondition, dtype: int64
In [74]:
train13 = train12.drop(['Street','Utilities','LandSlope','Condition2','RoofMatl','BsmtFinSF1','BsmtFinType2','BsmtUnfSF','Heating','CentralAir','Electrical','GarageQual','GarageCond','PavedDrive'],axis=1)
In [75]:
train13.shape
Out[75]:
(461, 52)
In [76]:
train13.columns
Out[76]:
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape',
       'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold',
       'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')
In [77]:
train14 = train13.copy()
In [78]:
train15=pd.get_dummies(train14[["MSZoning","LandContour","LotConfig","Neighborhood","Condition1","BldgType","HouseStyle","RoofStyle","Exterior1st","Exterior2nd","MasVnrType","Foundation","Functional","GarageType","SaleType","SaleCondition"]])
In [79]:
train16 = pd.concat([train14,train15],axis=1)
In [80]:
train17 = train16.drop(["MSZoning","LandContour","LotConfig","Neighborhood","Condition1","BldgType","HouseStyle","RoofStyle","Exterior1st","Exterior2nd","MasVnrType","Foundation","Functional","GarageType","SaleType","SaleCondition"],axis=1)
In [81]:
train17.shape
Out[81]:
(461, 145)
In [82]:
train17
Out[82]:
MSSubClass LotFrontage LotArea LotShape OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea ExterQual ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 60 65.0 8450 Reg 7 5 2003 2003 196.0 Gd ... 0 0 0 1 0 0 0 0 1 0
2 60 68.0 11250 IR1 7 5 2001 2002 162.0 Gd ... 0 0 0 1 0 0 0 0 1 0
4 60 84.0 14260 IR1 8 5 2000 2000 350.0 Gd ... 0 0 0 1 0 0 0 0 1 0
6 20 75.0 10084 Reg 8 5 2004 2005 186.0 Gd ... 0 0 0 1 0 0 0 0 1 0
10 20 70.0 11200 Reg 5 5 1965 1965 0.0 TA ... 0 0 0 1 0 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1444 20 63.0 8500 Reg 7 5 2004 2004 106.0 Gd ... 0 0 0 1 0 0 0 0 1 0
1448 50 70.0 11767 Reg 4 7 1910 2000 0.0 TA ... 0 0 0 1 0 0 0 0 1 0
1451 20 78.0 9262 Reg 8 5 2008 2009 194.0 Gd ... 0 1 0 0 0 0 0 0 0 1
1454 20 62.0 7500 Reg 7 5 2004 2005 0.0 Gd ... 0 0 0 1 0 0 0 0 1 0
1455 60 62.0 7917 Reg 6 5 1999 2000 0.0 TA ... 0 0 0 1 0 0 0 0 1 0

461 rows × 145 columns

In [83]:
train17["LotShape"].value_counts()
Out[83]:
Reg    305
IR1    151
IR2      3
IR3      2
Name: LotShape, dtype: int64
In [84]:
train17["LotShape"].replace({"Reg":4,"IR1":3,"IR2":2,"IR3":1},inplace=True)
In [85]:
train18 = train17.copy()
In [86]:
train18["ExterCond"].replace({"Fa":1,"TA":2,"Gd":3},inplace=True)
In [87]:
train18["ExterCond"].value_counts()
Out[87]:
2    431
3     27
1      3
Name: ExterCond, dtype: int64
In [88]:
train19 =train18.copy()
In [89]:
train19["BsmtQual"].replace({"Fa":1,"TA":2,"Gd":3,"Ex":4},inplace=True)
In [90]:
train20 = train19.copy()
In [91]:
train19["BsmtCond"].replace({"Fa":1,"TA":2,"Gd":3},inplace=True)
In [92]:
train20 = train19.copy()
In [93]:
train20["BsmtExposure"].replace({"No":1,"Mn":2,"Av":3,"Gd":4},inplace=True)
In [94]:
train21 = train20.copy()
In [95]:
train21["BsmtFinType1"].replace({"GLQ":6,"ALQ":5,"BLQ":4,"Rec":3,"LwQ":2,"Unf":1},inplace=True)
In [96]:
train22 = train21.copy()
In [97]:
train22["HeatingQC"].replace({"Fa":1,"TA":2,"Gd":3,"Ex":4},inplace=True)
In [98]:
train23 = train22.copy()
In [99]:
train23["KitchenQual"].replace({"Fa":1,"TA":2,"Gd":3,"Ex":4},inplace=True)
In [100]:
train24 = train23.copy()
In [101]:
train24["GarageFinish"].replace({"Unf":1,"RFn":2,"Fin":3},inplace=True)
In [102]:
train25 = train24.copy()
In [103]:
train25.shape
Out[103]:
(461, 145)
In [104]:
train25["ExterQual"].value_counts()
Out[104]:
TA    227
Gd    224
Ex      9
Fa      1
Name: ExterQual, dtype: int64
In [105]:
train25["ExterQual"].replace({"Fa":1,"TA":2,"Gd":3,"Ex":4},inplace=True)
In [106]:
train26 = train25.copy()
In [107]:
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
In [108]:
data =pd.DataFrame(minmax_scale(train25),columns=train25.columns)
In [109]:
data
Out[109]:
MSSubClass LotFrontage LotArea LotShape OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea ExterQual ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 0.4 0.426829 0.411622 1.000000 0.625 0.333333 0.939394 0.883333 0.450575 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.4 0.463415 0.623487 0.666667 0.625 0.333333 0.919192 0.866667 0.372414 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
2 0.4 0.658537 0.851241 0.666667 0.750 0.333333 0.909091 0.833333 0.804598 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
3 0.0 0.548780 0.535260 1.000000 0.750 0.333333 0.949495 0.916667 0.427586 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
4 0.0 0.487805 0.619703 1.000000 0.375 0.333333 0.555556 0.250000 0.000000 0.333333 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
456 0.0 0.402439 0.415406 1.000000 0.625 0.333333 0.949495 0.900000 0.243678 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
457 0.3 0.487805 0.662606 1.000000 0.250 1.000000 0.000000 0.833333 0.000000 0.333333 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
458 0.0 0.585366 0.473063 1.000000 0.750 0.333333 0.989899 0.983333 0.445977 0.666667 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
459 0.0 0.390244 0.339740 1.000000 0.625 0.333333 0.949495 0.916667 0.000000 0.666667 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
460 0.4 0.390244 0.371292 1.000000 0.500 0.333333 0.898990 0.833333 0.000000 0.333333 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0

461 rows × 145 columns

In [110]:
y = train26["SalePrice"]
y
Out[110]:
0       208500
2       223500
4       250000
6       307000
10      129500
         ...  
1444    179600
1448    112000
1451    287090
1454    185000
1455    175000
Name: SalePrice, Length: 461, dtype: int64
In [111]:
x_train,x_test,y_train,y_test = train_test_split(data,y,test_size=0.2,random_state=3)
In [112]:
train26.shape
Out[112]:
(461, 145)
In [113]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
In [114]:
rfr = RandomForestRegressor(random_state=42)
In [115]:
rfr = RandomForestRegressor(max_depth = 7, min_samples_leaf= 5, min_samples_split= 10, n_estimators= 50)
In [116]:
rfr = rfr.fit(x_train, y_train)
In [117]:
y_train_preds = rfr.predict(x_train)
y_train_preds
Out[117]:
array([175499.72189198, 155123.65331599, 196222.24235592, 114893.56869234,
       104540.72752359, 184291.05114593, 283330.17086688, 184858.91589257,
       227153.56390876, 109684.32633356, 108055.90153624, 108215.23390254,
       200203.03413307, 132661.32770188, 208578.71657455, 196222.24235592,
       147344.12374292, 132713.52214633, 206616.47854758, 195396.53516461,
       184276.70789196, 128895.88641946, 125938.22364547, 219361.561076  ,
       186028.40289017, 145303.71716339, 254151.47117094, 119884.04210703,
       289647.43465043, 109194.19707637, 206616.47854758, 151930.0798396 ,
       310984.6527409 , 139974.98124431, 202841.51822672, 142566.33274218,
       318803.12329757, 231968.72802731, 136482.0827917 , 277326.62724018,
        74004.64156751, 268443.44817041, 227153.56390876,  74004.64156751,
       264217.3185088 , 268158.19517896, 214652.67317517, 117922.83261011,
       113408.91053983, 192732.60161273, 144023.12577203, 126696.05817238,
       158110.00243506, 103386.99772805, 109785.69574192, 189231.21881958,
       226861.38519081, 203585.35093211, 164667.20773449, 236567.44080952,
       279826.61574739, 139894.56679987, 123782.30004587, 114893.56869234,
       231753.29845466, 289244.15949892, 112727.85933237, 230014.72282647,
       123512.75459132, 180062.61449082, 196771.1676741 , 235717.54595788,
       110191.529666  , 214471.99746089, 318774.28188342, 185940.14699369,
       139974.98124431, 119955.74210703, 142296.34391434, 235799.98437057,
       134637.71023163,  98225.43220418, 192845.2157794 , 166919.70618493,
       284611.63049459, 173435.20769675, 360432.15820698, 278943.12019184,
       102470.62391686, 183718.89133904, 180387.36299299, 225798.654113  ,
       129391.50575451, 123319.29030561, 215381.66775458, 193664.95632924,
       369432.68940984,  74004.64156751, 179204.41735028, 139977.58735438,
       369432.68940984, 153103.20796426, 130428.99787321, 159014.89458514,
       185883.35135877, 132713.52214633, 189173.42336503, 286920.06429257,
       180144.00639558, 214032.68238152, 156200.80595058, 159439.12514069,
       189819.45838691, 311583.86910454, 183320.43371105, 165451.62423132,
       145146.22217782, 101042.44702126, 134637.71023163, 175499.72189198,
       145062.50789211, 250103.12590841, 175713.38028692, 164744.55217893,
       230014.72282647, 150340.99353147, 109594.78277489, 368833.47304621,
       175227.44803552, 159439.12514069, 262064.76772369, 232887.82614347,
       192801.54911273, 186190.80458569, 230014.72282647, 192473.26233262,
       116481.10625789, 256811.99563126, 139170.6025568 , 230014.72282647,
       178624.14670169, 146901.94913975, 249927.01257507, 200259.59413307,
       166868.82364524, 154887.14447761, 186757.57081668, 196395.49752259,
       200203.03413307, 202819.72591903, 101098.61368792, 369432.68940984,
       327272.89803375, 156907.59426587, 197472.39703399, 101708.02022638,
       119884.04210703,  76689.58547661, 189819.45838691, 100604.6487906 ,
       108454.08199223, 126491.86372794, 232813.19503236, 223973.79751863,
       175713.38028692, 291938.47484161, 194782.38592481, 196771.1676741 ,
       151982.18052503, 312652.39938231, 222630.87616302, 184836.43951895,
       137085.32201588, 184828.46535145, 361031.37457062, 178538.07021817,
       124014.90610648, 192732.60161273, 236737.26392063, 184899.11722735,
       175767.88695358, 168804.38626374, 278474.3672878 , 145605.27571318,
        74004.64156751, 128858.05308613, 142499.0713136 , 289244.15949892,
       256526.01153602, 139974.98124431, 149998.40384893, 253998.26412224,
       144087.87324678, 238762.42028947, 264513.52110753, 312053.18301868,
       319402.3396612 , 338298.49215208, 189175.42336503, 173942.34885559,
       194660.91389306, 207975.56155291, 100375.38058547, 153489.37847652,
       289244.15949892, 114893.56869234, 215154.40296382, 186641.43564352,
       208166.98536243, 138051.66847009, 153539.32638417,  75539.60656751,
       200259.59413307, 230283.00610785, 132668.85547966, 124297.78632626,
       134445.8761768 , 197617.65351018, 113331.61256003, 238762.42028947,
       176283.11366687, 114893.56869234, 164050.93511544, 126676.05817238,
       128024.92666223, 179845.56356441, 367217.38425833, 126696.05817238,
       200259.59413307, 154900.06164932, 151936.94996947, 100375.38058547,
       327272.89803375, 338298.49215208, 238762.42028947, 167054.8776135 ,
       166868.82364524, 139620.20468085, 214603.43317517, 208578.71657455,
       124297.78632626, 109612.59420346, 282606.7555386 , 165588.15089799,
       175713.38028692, 134445.8761768 , 128004.92666223, 140592.34057727,
       219340.301076  , 224558.36285773, 199087.96394237, 136869.71543163,
       113125.13559755, 216347.52791331, 128915.88641946, 187263.33567886,
       173942.34885559, 181427.76706871, 262364.49572369, 180144.00639558,
       141541.00072583, 179770.94811997, 250224.49146396, 231968.72802731,
       128671.46578454, 249740.47057507, 129806.01399627, 173466.03436341,
       229940.58854075, 200515.15070882, 215077.30884617, 207367.06380399,
       180387.36299299, 281078.36216742, 120133.96432925, 247096.80841951,
       277326.62724018, 202898.07822672, 189734.12505358, 119955.74210703,
       114893.56869234, 238762.42028947, 136744.12289195, 226806.13843757,
       250318.06533698, 224950.79928977, 173942.34885559, 238762.42028947,
       259884.8550409 , 188948.57718899, 189777.79172025, 129916.25209151,
       219340.301076  , 109753.0885096 , 226861.38519081, 202819.72591903,
       193694.19716257, 194411.63650601, 188955.49385565, 147772.520782  ,
       134598.8768983 , 138917.18675738, 190264.60746551, 147300.98088578,
       223973.79751863, 179255.81535028, 132915.02214633, 168890.46274725,
       232698.00281013, 180387.36299299, 220193.59198509, 188081.65194054,
       119884.04210703, 157153.06093254,  75539.60656751, 208166.98536243,
       255745.5368138 , 189131.75669837, 129367.47797674, 368833.47304621,
       153403.92526585, 257997.03494078, 166313.32804085, 117128.52726643,
       129893.91875818, 184864.94089257, 118716.90952671, 158800.26074675,
       178507.2435515 ,  96666.36224692, 214603.43317517, 139170.6025568 ,
       193694.19716257, 184751.7232491 , 369432.68940984, 202696.1500857 ,
       310984.6527409 , 132668.85547966, 168259.69101565, 204953.19014498,
        74004.64156751, 108933.44266078, 214376.43238152, 144238.970469  ,
       317954.31123407,  80951.02175449, 219340.301076  , 225900.51125586,
       224880.17591315, 132705.99436855, 142566.33274218, 156160.47867785,
       208400.52107672, 226759.52804796, 108851.19923221, 144322.68475472,
       208578.71657455, 283330.17086688, 149183.41297591, 173859.49171273,
       250299.94346396, 329808.24825597, 139859.25447161, 247577.78339387,
        75539.60656751, 154884.99169983, 156124.92200397, 139237.86398537])
In [118]:
y_test_preds = rfr.predict(x_test)
y_test_preds[0:10]
Out[118]:
array([247096.80841951, 178624.14670169, 134598.8768983 , 116481.10625789,
       139170.6025568 , 214546.14746089, 236662.63280952, 207975.56155291,
       151973.22269675, 174309.08961261])
In [119]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate the regression error metrics
mae = mean_absolute_error(y_train, y_train_preds)
mse = mean_squared_error(y_train, y_train_preds)
rmse = mean_squared_error(y_train, y_train_preds, squared=False)
r2 = r2_score(y_train, y_train_preds)

    # Print the report
print(f"Mean Absolute Error: {mae:.3f}")
print(f"Mean Squared Error: {mse:.3f}")
print(f"Root Mean Squared Error: {rmse:.3f}")
print(f"R-squared: {r2:.3f}")
Mean Absolute Error: 1096.882
Mean Squared Error: 15516923.817
Root Mean Squared Error: 3939.153
R-squared: 0.996
In [120]:
r2 = r2_score(y_test, y_test_preds)
r2
Out[120]:
0.9842893356874856
In [121]:
rfr.predict(pd.DataFrame(data.iloc[0:5,:]))
Out[121]:
array([208166.98536243, 223973.79751863, 250318.06533698, 311583.86910454,
       129391.50575451])
In [122]:
a=pd.DataFrame(y_test_preds)
b=pd.DataFrame(y_test)
In [123]:
a.rename({0:"predicted"},axis=1,inplace=True)
In [128]:
a=np.round(a)
In [129]:
b.reset_index(drop=True,inplace=True)
In [130]:
pd.concat([b,a],axis=1)
Out[130]:
SalePrice predicted
0 245350 247097.0
1 178000 178624.0
2 135000 134599.0
3 116050 116481.0
4 139000 139171.0
... ... ...
88 219500 219340.0
89 176000 175713.0
90 127500 126696.0
91 119000 119239.0
92 328000 329808.0

93 rows × 2 columns

In [131]:
plt.figure(figsize=(12,8))
plt.xlabel("Actual Values")
plt.ylabel("Predicted values")
plt.title("The Scatterplot of Relationship between Actual Values and Predictions")
plt.scatter(a,b)
Out[131]:
<matplotlib.collections.PathCollection at 0x15cdfc46310>
In [ ]: